In [1]:
# Interactive plotting (Plotly)
import plotly.graph_objs as go
import plotly.io as pio
import plotly.express as px
# Data analysis and manipulation
import pandas as pd
# Static plotting (Matplotlib), used for the word clouds
import matplotlib.pyplot as plt
# Plotly offline API (provides init_notebook_mode and iplot)
import plotly.offline as py
# Initialize Plotly for use inside the notebook
py.init_notebook_mode(connected=True)
# Render Plotly figures with the 'colab' renderer by default
pio.renderers.default = 'colab'
In [2]:
# Load the per-country COVID summary from covid.csv into dataset1
dataset1 = pd.read_csv(filepath_or_buffer="covid.csv")
# Preview the first 5 rows
dataset1.head(n=5)
Out[2]:
| Country/Region | Continent | Population | TotalCases | NewCases | TotalDeaths | NewDeaths | TotalRecovered | NewRecovered | ActiveCases | Serious,Critical | Tot Cases/1M pop | Deaths/1M pop | TotalTests | Tests/1M pop | WHO Region | iso_alpha | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | USA | North America | 3.311981e+08 | 5032179 | NaN | 162804.0 | NaN | 2576668.0 | NaN | 2292707.0 | 18296.0 | 15194.0 | 492.0 | 63139605.0 | 190640.0 | Americas | USA |
| 1 | Brazil | South America | 2.127107e+08 | 2917562 | NaN | 98644.0 | NaN | 2047660.0 | NaN | 771258.0 | 8318.0 | 13716.0 | 464.0 | 13206188.0 | 62085.0 | Americas | BRA |
| 2 | India | Asia | 1.381345e+09 | 2025409 | NaN | 41638.0 | NaN | 1377384.0 | NaN | 606387.0 | 8944.0 | 1466.0 | 30.0 | 22149351.0 | 16035.0 | South-EastAsia | IND |
| 3 | Russia | Europe | 1.459409e+08 | 871894 | NaN | 14606.0 | NaN | 676357.0 | NaN | 180931.0 | 2300.0 | 5974.0 | 100.0 | 29716907.0 | 203623.0 | Europe | RUS |
| 4 | South Africa | Africa | 5.938157e+07 | 538184 | NaN | 9604.0 | NaN | 387316.0 | NaN | 141264.0 | 539.0 | 9063.0 | 162.0 | 3149807.0 | 53044.0 | Africa | ZAF |
In [3]:
# Print the (rows, columns) shape of dataset1, then its total number of
# elements (rows * columns), one value per line
print(dataset1.shape, dataset1.size, sep="\n")
(209, 17) 3553
In [4]:
# Print a concise summary of dataset1: index range, column names,
# non-null counts per column, dtypes and memory usage
dataset1.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 209 entries, 0 to 208 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Country/Region 209 non-null object 1 Continent 208 non-null object 2 Population 208 non-null float64 3 TotalCases 209 non-null int64 4 NewCases 4 non-null float64 5 TotalDeaths 188 non-null float64 6 NewDeaths 3 non-null float64 7 TotalRecovered 205 non-null float64 8 NewRecovered 3 non-null float64 9 ActiveCases 205 non-null float64 10 Serious,Critical 122 non-null float64 11 Tot Cases/1M pop 208 non-null float64 12 Deaths/1M pop 187 non-null float64 13 TotalTests 191 non-null float64 14 Tests/1M pop 191 non-null float64 15 WHO Region 184 non-null object 16 iso_alpha 209 non-null object dtypes: float64(12), int64(1), object(4) memory usage: 27.9+ KB
In [5]:
# Load the per-date, per-country COVID records from covid_grouped.csv into dataset2
dataset2 = pd.read_csv(filepath_or_buffer="covid_grouped.csv")
# Preview the first 5 rows
dataset2.head(n=5)
Out[5]:
| Date | Country/Region | Confirmed | Deaths | Recovered | Active | New cases | New deaths | New recovered | WHO Region | iso_alpha | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2020-01-22 | Afghanistan | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Eastern Mediterranean | AFG |
| 1 | 2020-01-22 | Albania | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Europe | ALB |
| 2 | 2020-01-22 | Algeria | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Africa | DZA |
| 3 | 2020-01-22 | Andorra | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Europe | AND |
| 4 | 2020-01-22 | Angola | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Africa | AGO |
In [6]:
# Print the (rows, columns) shape of dataset2, then its total number of
# elements (rows * columns), one value per line
print(dataset2.shape, dataset2.size, sep="\n")
(35156, 11) 386716
In [7]:
# Summary of dataset2: index range, column names, non-null counts, dtypes, memory usage
dataset2.info() # prints a concise summary of the dataframe
<class 'pandas.core.frame.DataFrame'> RangeIndex: 35156 entries, 0 to 35155 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Date 35156 non-null object 1 Country/Region 35156 non-null object 2 Confirmed 35156 non-null int64 3 Deaths 35156 non-null int64 4 Recovered 35156 non-null int64 5 Active 35156 non-null int64 6 New cases 35156 non-null int64 7 New deaths 35156 non-null int64 8 New recovered 35156 non-null int64 9 WHO Region 35156 non-null object 10 iso_alpha 35156 non-null object dtypes: int64(7), object(4) memory usage: 3.0+ MB
In [8]:
# Drop the NewCases, NewDeaths and NewRecovered *columns* from dataset1: they are
# almost entirely NaN (only 3-4 non-null values out of 209 rows, per dataset1.info()).
# Reassigning the result instead of dropping in place, together with
# errors='ignore', keeps this cell idempotent: re-running it no longer raises
# KeyError once the columns are already gone.
dataset1 = dataset1.drop(columns=['NewCases', 'NewDeaths', 'NewRecovered'],
                         errors='ignore')
# Show 5 randomly chosen rows; the fixed seed makes the selection reproducible
dataset1.sample(5, random_state=42)
Out[8]:
| Country/Region | Continent | Population | TotalCases | TotalDeaths | TotalRecovered | ActiveCases | Serious,Critical | Tot Cases/1M pop | Deaths/1M pop | TotalTests | Tests/1M pop | WHO Region | iso_alpha | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 157 | San Marino | Europe | 33938.0 | 699 | 42.0 | 657.0 | 0.0 | NaN | 20596.0 | 1238.0 | 6068.0 | 178797.0 | Europe | SMR |
| 139 | Uganda | Africa | 45867852.0 | 1223 | 5.0 | 1102.0 | 116.0 | NaN | 27.0 | 0.1 | 288367.0 | 6287.0 | Africa | UGA |
| 43 | Portugal | Europe | 10193593.0 | 52061 | 1743.0 | 37840.0 | 12478.0 | 42.0 | 5107.0 | 171.0 | 1705474.0 | 167308.0 | Europe | PRT |
| 25 | Kazakhstan | Asia | 18798667.0 | 95942 | 1058.0 | 68871.0 | 26013.0 | 221.0 | 5104.0 | 56.0 | 2163713.0 | 115099.0 | Europe | KAZ |
| 68 | El Salvador | North America | 6489514.0 | 19126 | 513.0 | 9236.0 | 9377.0 | 509.0 | 2947.0 | 79.0 | 251271.0 | 38720.0 | Americas | SLV |
In [11]:
# Figure-factory helper that renders a DataFrame as a Plotly table
from plotly.figure_factory import create_table

# Three-stop colorscale passed to create_table (positions 0, 0.5 and 1)
colorscale = [
    [0, '#4d004c'],
    [0.5, '#f2e5ff'],
    [1, '#ffffff'],
]
# Build a table figure from the first 15 rows of dataset1 and display it
table = create_table(dataset1.head(15), colorscale=colorscale)
py.iplot(table)
In [12]:
# Bar chart of TotalCases per Country/Region for the first 15 rows of dataset1,
# with bars colored by TotalCases; Continent is added to the hover box.
px.bar(
    data_frame=dataset1.head(15),
    x="Country/Region",
    y="TotalCases",
    color="TotalCases",
    height=500,
    hover_data=["Country/Region", "Continent"],
)
In [13]:
# Bar chart of TotalCases per Country/Region for the first 15 rows of dataset1,
# with bars colored by TotalDeaths.
px.bar(
    data_frame=dataset1.head(15),
    x="Country/Region",
    y="TotalCases",
    color="TotalDeaths",
    height=500,
    hover_data=["Country/Region", "Continent"],
)
In [14]:
# Bar chart of TotalCases per Country/Region for the first 15 rows of dataset1,
# with bars colored by TotalRecovered.
px.bar(
    data_frame=dataset1.head(15),
    x="Country/Region",
    y="TotalCases",
    color="TotalRecovered",
    height=500,
    hover_data=["Country/Region", "Continent"],
)
In [15]:
# Horizontal bar chart of TotalTests per Country/Region for the first 15 rows
# of dataset1, with bars colored by TotalTests.
px.bar(
    data_frame=dataset1.head(15),
    x="TotalTests",
    y="Country/Region",
    color="TotalTests",
    orientation="h",
    height=500,
    hover_data=["Country/Region", "Continent"],
)
In [16]:
# Horizontal bar chart of TotalTests grouped on the Continent axis for the
# first 15 rows of dataset1; each row contributes its own segment, colored by
# TotalTests.
px.bar(
    data_frame=dataset1.head(15),
    x="TotalTests",
    y="Continent",
    color="TotalTests",
    orientation="h",
    height=500,
    hover_data=["Country/Region", "Continent"],
)
In [17]:
# Bubble chart of TotalCases against Continent for every row of dataset1;
# marker color and marker size both encode TotalCases.
px.scatter(
    data_frame=dataset1,
    x="Continent",
    y="TotalCases",
    color="TotalCases",
    size="TotalCases",
    size_max=80,
    hover_data=["Country/Region", "Continent"],
)
In [18]:
# Bubble chart of TotalCases against Continent for the first 57 rows of
# dataset1, drawn on a logarithmic y-axis.
px.scatter(
    data_frame=dataset1.head(57),
    x="Continent",
    y="TotalCases",
    color="TotalCases",
    size="TotalCases",
    size_max=80,
    log_y=True,
    hover_data=["Country/Region", "Continent"],
)
In [19]:
# Bubble chart of TotalTests against Continent for the first 54 rows of
# dataset1; marker color and marker size both encode TotalTests.
px.scatter(
    data_frame=dataset1.head(54),
    x="Continent",
    y="TotalTests",
    color="TotalTests",
    size="TotalTests",
    size_max=80,
    hover_data=["Country/Region", "Continent"],
)
In [20]:
# Bubble chart of TotalTests against Continent for the first 50 rows of
# dataset1, drawn on a logarithmic y-axis.
px.scatter(
    data_frame=dataset1.head(50),
    x="Continent",
    y="TotalTests",
    color="TotalTests",
    size="TotalTests",
    size_max=80,
    log_y=True,
    hover_data=["Country/Region", "Continent"],
)
In [21]:
# Bubble chart of TotalCases per Country/Region for the first 100 rows of
# dataset1; marker color and marker size both encode TotalCases.
px.scatter(
    data_frame=dataset1.head(100),
    x="Country/Region",
    y="TotalCases",
    color="TotalCases",
    size="TotalCases",
    size_max=80,
    hover_data=["Country/Region", "Continent"],
)
In [22]:
# Bubble chart of TotalCases per Country/Region for the first 30 rows of
# dataset1 on a logarithmic y-axis; one color per country, marker size encodes
# TotalCases.
px.scatter(
    data_frame=dataset1.head(30),
    x="Country/Region",
    y="TotalCases",
    color="Country/Region",
    size="TotalCases",
    size_max=80,
    log_y=True,
    hover_data=["Country/Region", "Continent"],
)
In [23]:
# Bubble chart of TotalDeaths per Country/Region for the first 10 rows of
# dataset1; one color per country, marker size encodes TotalDeaths.
px.scatter(
    data_frame=dataset1.head(10),
    x="Country/Region",
    y="TotalDeaths",
    color="Country/Region",
    size="TotalDeaths",
    size_max=80,
    hover_data=["Country/Region", "Continent"],
)
In [24]:
# Bubble chart of Tests/1M pop per Country/Region for the first 30 rows of
# dataset1; one color per country, marker size encodes Tests/1M pop.
px.scatter(
    data_frame=dataset1.head(30),
    x="Country/Region",
    y="Tests/1M pop",
    color="Country/Region",
    size="Tests/1M pop",
    size_max=80,
    hover_data=["Country/Region", "Continent"],
)
In [25]:
# Bubble chart of Tests/1M pop per Country/Region for the first 30 rows of
# dataset1; marker color and marker size both encode Tests/1M pop.
px.scatter(
    data_frame=dataset1.head(30),
    x="Country/Region",
    y="Tests/1M pop",
    color="Tests/1M pop",
    size="Tests/1M pop",
    size_max=80,
    hover_data=["Country/Region", "Continent"],
)
In [26]:
# Bubble chart of TotalDeaths against TotalCases for the first 30 rows of
# dataset1; marker color and marker size both encode TotalDeaths.
px.scatter(
    data_frame=dataset1.head(30),
    x="TotalCases",
    y="TotalDeaths",
    color="TotalDeaths",
    size="TotalDeaths",
    size_max=80,
    hover_data=["Country/Region", "Continent"],
)
In [27]:
# Bubble chart of TotalDeaths against TotalCases for the first 30 rows of
# dataset1, with both axes on a logarithmic scale.
px.scatter(
    data_frame=dataset1.head(30),
    x="TotalCases",
    y="TotalDeaths",
    color="TotalDeaths",
    size="TotalDeaths",
    size_max=80,
    log_x=True,
    log_y=True,
    hover_data=["Country/Region", "Continent"],
)
In [28]:
# Bubble chart of TotalCases against TotalTests for the first 30 rows of
# dataset1, with both axes on a logarithmic scale; marker color and marker
# size both encode TotalTests.
px.scatter(
    data_frame=dataset1.head(30),
    x="TotalTests",
    y="TotalCases",
    color="TotalTests",
    size="TotalTests",
    size_max=80,
    log_x=True,
    log_y=True,
    hover_data=["Country/Region", "Continent"],
)
In [29]:
# Bar chart of Confirmed against Date over all rows of dataset2, with bars
# colored by Confirmed; Country/Region is added to the hover box.
px.bar(
    data_frame=dataset2,
    x="Date",
    y="Confirmed",
    color="Confirmed",
    height=400,
    hover_data=["Confirmed", "Date", "Country/Region"],
)
In [30]:
# Bar chart of Confirmed against Date over all rows of dataset2, drawn on a
# logarithmic y-axis.
px.bar(
    data_frame=dataset2,
    x="Date",
    y="Confirmed",
    color="Confirmed",
    log_y=True,
    height=400,
    hover_data=["Confirmed", "Date", "Country/Region"],
)
In [31]:
# Bar chart of Deaths against Date over all rows of dataset2 on a linear
# y-axis, with bars colored by Deaths; the hover box also lists Confirmed.
px.bar(
    data_frame=dataset2,
    x="Date",
    y="Deaths",
    color="Deaths",
    log_y=False,
    height=400,
    hover_data=["Confirmed", "Date", "Country/Region"],
)
In [32]:
# Animated world map of Confirmed, one frame per Date; countries are located
# by their iso_alpha code and shaded with the "Blues" color scale.
px.choropleth(
    data_frame=dataset2,
    locations="iso_alpha",
    color="Confirmed",
    color_continuous_scale="Blues",
    hover_name="Country/Region",
    animation_frame="Date",
)
In [33]:
# Animated world map of Deaths, one frame per Date; countries are located by
# their iso_alpha code and shaded with the "Viridis" color scale.
px.choropleth(
    data_frame=dataset2,
    locations="iso_alpha",
    color="Deaths",
    color_continuous_scale="Viridis",
    hover_name="Country/Region",
    animation_frame="Date",
)
In [34]:
# Animated world map of Recovered, one frame per Date, drawn with the
# "natural earth" projection and shaded with the "RdYlGn" color scale.
px.choropleth(
    data_frame=dataset2,
    locations="iso_alpha",
    color="Recovered",
    color_continuous_scale="RdYlGn",
    projection="natural earth",
    hover_name="Country/Region",
    animation_frame="Date",
)
In [35]:
# Animated bar chart of Confirmed per WHO Region, one frame per Date. Every
# country is its own bar segment, so each region's bar height is the sum of
# Confirmed over its countries on that date.
#
# Plotly's animation guidance is to always fix the axis range for animated
# figures; without it the axis can stay fitted to the (near-zero) early data
# and later bars run off the top of the plot. Pin the y-axis to the largest
# per-region daily total, plus 5% headroom.
max_region_total = float(
    dataset2.groupby(["Date", "WHO Region"])["Confirmed"].sum().max()
)
px.bar(dataset2, x="WHO Region", y="Confirmed", color="WHO Region",
       animation_frame="Date", hover_name="Country/Region",
       range_y=[0, max_region_total * 1.05])
In [36]:
# Load the COVID deaths-by-condition records from coviddeath.csv into dataset3
dataset3 = pd.read_csv(filepath_or_buffer="coviddeath.csv")
# Preview the first 5 rows
dataset3.head(n=5)
Out[36]:
| Data as of | Start Week | End Week | State | Condition Group | Condition | ICD10_codes | Age Group | Number of COVID-19 Deaths | Flag | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 08/30/2020 | 02/01/2020 | 08/29/2020 | US | Respiratory diseases | Influenza and pneumonia | J09-J18 | 0-24 | 122.0 | NaN |
| 1 | 08/30/2020 | 02/01/2020 | 08/29/2020 | US | Respiratory diseases | Influenza and pneumonia | J09-J18 | 25-34 | 596.0 | NaN |
| 2 | 08/30/2020 | 02/01/2020 | 08/29/2020 | US | Respiratory diseases | Influenza and pneumonia | J09-J18 | 35-44 | 1521.0 | NaN |
| 3 | 08/30/2020 | 02/01/2020 | 08/29/2020 | US | Respiratory diseases | Influenza and pneumonia | J09-J18 | 45-54 | 4186.0 | NaN |
| 4 | 08/30/2020 | 02/01/2020 | 08/29/2020 | US | Respiratory diseases | Influenza and pneumonia | J09-J18 | 55-64 | 10014.0 | NaN |
In [37]:
# For each distinct Condition, count the non-null entries in every other column
dataset3.groupby(by=["Condition"]).count()
Out[37]:
| Data as of | Start Week | End Week | State | Condition Group | ICD10_codes | Age Group | Number of COVID-19 Deaths | Flag | |
|---|---|---|---|---|---|---|---|---|---|
| Condition | |||||||||
| Adult respiratory distress syndrome | 540 | 540 | 540 | 540 | 540 | 540 | 540 | 272 | 268 |
| All other conditions and causes (residual) | 540 | 540 | 540 | 540 | 540 | 540 | 540 | 363 | 177 |
| Alzheimer disease | 530 | 530 | 530 | 530 | 530 | 530 | 530 | 144 | 386 |
| COVID-19 | 540 | 540 | 540 | 540 | 540 | 540 | 540 | 377 | 163 |
| Cardiac arrest | 520 | 520 | 520 | 520 | 520 | 520 | 520 | 219 | 301 |
| Cardiac arrhythmia | 540 | 540 | 540 | 540 | 540 | 540 | 540 | 192 | 348 |
| Cerebrovascular diseases | 530 | 530 | 530 | 530 | 530 | 530 | 530 | 187 | 343 |
| Chronic lower respiratory diseases | 540 | 540 | 540 | 540 | 540 | 540 | 540 | 229 | 311 |
| Diabetes | 540 | 540 | 540 | 540 | 540 | 540 | 540 | 276 | 264 |
| Heart failure | 540 | 540 | 540 | 540 | 540 | 540 | 540 | 204 | 336 |
| Hypertensive diseases | 540 | 540 | 540 | 540 | 540 | 540 | 540 | 264 | 276 |
| Influenza and pneumonia | 540 | 540 | 540 | 540 | 540 | 540 | 540 | 331 | 209 |
| Intentional and unintentional injury, poisoning, and other adverse events | 520 | 520 | 520 | 520 | 520 | 520 | 520 | 188 | 332 |
| Ischemic heart disease | 540 | 540 | 540 | 540 | 540 | 540 | 540 | 224 | 316 |
| Malignant neoplasms | 540 | 540 | 540 | 540 | 540 | 540 | 540 | 198 | 342 |
| Obesity | 530 | 530 | 530 | 530 | 530 | 530 | 530 | 182 | 348 |
| Other diseases of the circulatory system | 530 | 530 | 530 | 530 | 530 | 530 | 530 | 213 | 317 |
| Other diseases of the respiratory system | 540 | 540 | 540 | 540 | 540 | 540 | 540 | 188 | 352 |
| Renal failure | 540 | 540 | 540 | 540 | 540 | 540 | 540 | 238 | 302 |
| Respiratory arrest | 480 | 480 | 480 | 480 | 480 | 480 | 480 | 111 | 369 |
| Respiratory failure | 540 | 540 | 540 | 540 | 540 | 540 | 540 | 320 | 220 |
| Sepsis | 530 | 530 | 530 | 530 | 530 | 530 | 530 | 243 | 287 |
| Vascular and unspecified dementia | 530 | 530 | 530 | 530 | 530 | 530 | 530 | 191 | 339 |
In [39]:
# WordCloud builds the word-cloud image from raw text
from wordcloud import WordCloud

# Collect every entry of the "Condition" column and join them into a single
# space-separated string
sentences = dataset3["Condition"].tolist()
sentences_as_a_string = ' '.join(sentences)

# Generate the word cloud from that string and draw it on a 20x20 inch figure
condition_cloud = WordCloud().generate(sentences_as_a_string)
plt.figure(figsize=(20, 20))
plt.imshow(condition_cloud)
Out[39]:
<matplotlib.image.AxesImage at 0x2624438e8a0>
In [40]:
# Collect every entry of the "Condition Group" column and join them into a
# single space-separated string
column2_tolist = dataset3["Condition Group"].tolist()
column_to_string = " ".join(column2_tolist)

# Generate the word cloud from that string and draw it on a 20x20 inch figure
condition_group_cloud = WordCloud().generate(column_to_string)
plt.figure(figsize=(20, 20))
plt.imshow(condition_group_cloud)
Out[40]:
<matplotlib.image.AxesImage at 0x262441fd910>
In [ ]: